Introduction

We are examining the extent to which individuals’ subjective well-being can be predicted by objective factors that determine individual health decline and for which group this declaration bias is particularly large. And in that sense, which predictors (such as age, labor market status, personality traits and hand-grip strength) help improve predictions of individual health declines.

This report documents the steps taken to preprocess and analyze the SOEP data for health-related outcomes: subjective and objective health measures, predictors, and control variables. The transformations include cleaning, imputing missing values, and deriving new variables.


1. Setup

1.1 Load Required Packages

# Load required packages
my_packages <- c("tidyverse", "haven", "openxlsx", "labelled", "dplyr", "lubridate","ggplot2","skimr")

for (p in my_packages) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p, dependencies = TRUE)
  }
  library(p, character.only = TRUE)
}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: haven
## 
## Loading required package: openxlsx
## 
## Loading required package: labelled
## 
## Loading required package: skimr

1.2 Load the pl Dataset

main <- read_dta('/Users/salmahoumane/Documents/R/pl.dta') %>%
  select(
    "pid", "syear", "ple0008", "plh0035", "ple0055", "ple0056", "ple0072", "ple0081_h", "ple0086_v4",
    "ple0080_v2", "ple0086_v1", "ple0086_v4", paste0("ple00", 11:23),
    "ple0004", "ple0005", "ple0095", "plh0182", "plh0171",
    "plh0042", "plh0033", "plh0032", "ple0010_h", "ple0006", "ple0007"
  )
cat("Main dataset dimensions:", dim(main), "\n")
## Main dataset dimensions: 377869 35

1.3 Merge with Other Datasets

# Merge with `pgen` dataset
data_pgen <- read_dta('/Users/salmahoumane/Documents/R/pgen.dta') %>%
  select("pid", "syear", "pgerwzeit", "pgemplst", "pglabnet", "pgtatzeit", "pgexpue", "pgstib","pgfamstd")
cat("PGEN dataset dimensions:", dim(data_pgen), "\n")
## PGEN dataset dimensions: 381814 9
main_before_merge <- dim(main)  # Store dimensions before merge
main <- merge(main, data_pgen, by = c("pid", "syear"), all.x = TRUE)
cat("After merging with pgen: Rows =", nrow(main), "Cols =", ncol(main), "\n")
## After merging with pgen: Rows = 377869 Cols = 42
cat("Same number of rows then the main dataset with 7 new rows -> Merge successful")
## Same number of rows then the main dataset with 7 new rows -> Merge successful
# Check for duplicate rows introduced during merge
duplicates <- main %>%
  group_by(pid, syear) %>%
  summarise(count = n()) %>%
  filter(count > 1)
## `summarise()` has grouped output by 'pid'. You can override using the `.groups`
## argument.
if (nrow(duplicates) > 0) {
  cat("Warning: Duplicates detected after merging with pgen!\n")
  print(duplicates)
} else {
  cat("No duplicates found after merging with pgen.\n")
}
## No duplicates found after merging with pgen.
# Merge with `ppathl` dataset
data_ppathl <- read_dta('/Users/salmahoumane/Documents/R/ppathl.dta') %>%
  select("pid", "hid", "syear", "psample", "gebjahr", "sex", "migback", "germborn")
cat("PPATHL dataset dimensions:", dim(data_ppathl), "\n")
## PPATHL dataset dimensions: 634864 8
main_before_ppathl <- dim(main)  # Store dimensions before second merge

main <- left_join(main, data_ppathl, by = c("pid", "syear"))
cat("After merging with ppathl: Rows =", nrow(main), "Cols =", ncol(main), "\n")
## After merging with ppathl: Rows = 377869 Cols = 48
cat("Same number of rows then the main dataset with 6 new rows -> Merge successful")
## Same number of rows then the main dataset with 6 new rows -> Merge successful
# Final snapshot of dimensions
cat("Final dataset dimensions after all merges:", dim(main), "\n")
## Final dataset dimensions after all merges: 377869 48

1.4 Main filtering: Analysis post 1999

# Drop observations prior to 1999
main <- main %>%
  filter(syear >= 1999)

2. Outcome Variables: Subjective Health

2.1 Health

# Clean and rename the `health` variable
main <- main %>%
  rename(health = ple0008) %>%
  filter(health > 0) %>%  # Filter valid health values
  mutate(health = 6 - health)  # Reverse the scale: 1 -> 5, 5 -> 1

# Visualisation: Bar plot
ggplot(main, aes(x = factor(health))) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Self-Assessed Health",
    x = "Health Category (1 = Poor, 5 = Excellent)",
    y = "Count"
  ) +
  theme_minimal()

2.2 Worries About Health

# Create `worr_health_categorical` and `worr_health_dummy`
# Calculate the number of missing values in `plh0035` (worries about health) for each year
missing_counts <- main %>%
  group_by(syear) %>%
  summarise(missing_count = sum(plh0035 < 0, na.rm = TRUE))
# Visualization
print(missing_counts)
## # A tibble: 23 × 2
##    syear     missing_count
##    <dbl+lbl>         <int>
##  1 1999                 14
##  2 2000                 78
##  3 2001                 85
##  4 2002                 58
##  5 2003                 45
##  6 2004                 57
##  7 2005                 49
##  8 2006                 65
##  9 2007                 57
## 10 2008                 35
## # ℹ 13 more rows
## There are a lot of missing in years 2011, 2012 and 2013. Not to lose too many information, we will impute these values within each person with the mean of 2010 and 2014, other missings will be dropped.
main <- main %>%
  rename(worr_health_categorical = plh0035) %>%
  mutate(
    worr_health_categorical = case_when(
      worr_health_categorical < 0 ~ NA_real_,
      TRUE ~ worr_health_categorical
    )
  ) %>%
  group_by(pid) %>%
  mutate(
    worr_health_categorical = if_else(
      syear %in% c(2011, 2012, 2013) & is.na(worr_health_categorical),
      floor(
        mean(
          worr_health_categorical[syear %in% c(2010, 2014)],
          na.rm = TRUE
        )
      ),
      worr_health_categorical
    )
  ) %>%
  ungroup() %>%
  drop_na(worr_health_categorical) %>%
  mutate(
    worr_health_categorical = factor(
      worr_health_categorical, levels = c(3, 2, 1),
      labels = c("no worries", "worries a little", "worries a lot")
    ),
    worr_health_dummy = if_else(
      worr_health_categorical %in% c("worries a lot", "worries a little"), 1, 0
    )
  ) %>%
  # Drop any rows with NA in worr_health_categorical
  drop_na(worr_health_categorical)
# Visualisation: Bar plot (categorical)
ggplot(main, aes(x = worr_health_categorical)) +
  geom_bar(fill = "coral") +
  labs(
    title = "Distribution of Worries About Health",
    x = "Worries About Health",
    y = "Count"
  ) +
  theme_minimal()

# Visualisation: Pie Char (dummy)
main %>%
  group_by(worr_health_dummy) %>%
  summarize(count = n()) %>%
  mutate(
    proportion = count / sum(count),
    label = paste0(round(proportion * 100, 1), "%")
  ) %>%
  ggplot(aes(x = "", y = proportion, fill = factor(worr_health_dummy))) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(
    title = "Proportion of Health Worry (Dummy)",
    fill = "Health Worry (Dummy)"
  ) +
  theme_minimal() +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5))

3. Outcome Variables: Objective Health

3.1 Grip Strength: it has the variable calculated twice for each hand for every other year between the years 2006-2018, we use gs01 which tells us about the dominant hand (1 is right, 2 is left, 3 is both)

# Process grip strength data and merge with main
gripstr <- read_dta("/Users/salmahoumane/Documents/R/gripstr.dta") %>%
  select(pid, syear, gs01, gs03, gs04, gs05, gs06) %>%
  rename(
    gripr1 = gs03, gripr2 = gs04,
    gripl1 = gs05, gripl2 = gs06
  ) %>%
  filter(gs01 %in% c(1, 2, 3)) %>%
  mutate(across(starts_with("grip"), ~ ifelse(. < 0, NA_real_, .))) %>%
  mutate(
    avg_gripr = rowMeans(cbind(gripr1, gripr2), na.rm = TRUE),
    avg_gripl = rowMeans(cbind(gripl1, gripl2), na.rm = TRUE),
    avg_grip = case_when(
      gs01 == 1 ~ avg_gripr,
      gs01 == 2 ~ avg_gripl,
      gs01 == 3 ~ rowMeans(cbind(avg_gripr, avg_gripl), na.rm = TRUE),
      TRUE ~ NA_real_
    )
  )

main <- main %>%
  left_join(gripstr, by = c("pid", "syear"))

# Calculate the number of missing values in grip strength columns for each year
missing_counts_gripstr <- main %>%
  group_by(syear) %>%
  summarise(missing_avg_grip = sum(is.na(avg_grip)))

# Display the results
print(missing_counts_gripstr)
## # A tibble: 23 × 2
##    syear     missing_avg_grip
##    <dbl+lbl>            <int>
##  1 1999                  6890
##  2 2000                 12092
##  3 2001                 10989
##  4 2002                 11805
##  5 2003                 11174
##  6 2004                 10857
##  7 2005                 10371
##  8 2006                  8621
##  9 2007                 10353
## 10 2008                  7284
## # ℹ 13 more rows
# Visualisation: Histogram 
ggplot(main, aes(x = avg_grip)) +
  geom_histogram(binwidth = 20, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(
    title = "Distribution of Average Grip Strength",
    x = "Average Grip Strength",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 264444 rows containing non-finite outside the scale range
## (`stat_bin()`).

3.2 Hospital Stay

# Create `hospital_stay` variable
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    ple0055_NA = sum(is.na(ple0055)),
    ple0055_not_asked = sum(ple0055 %in% c(-8, -5), na.rm = TRUE),
    ple0055_other_neg = sum(ple0055 < 0 & !(ple0055 %in% c(-8, -5)), na.rm = TRUE),
    ple0056_NA = sum(is.na(ple0056)),
    ple0056_not_asked = sum(ple0056 %in% c(-8, -5), na.rm = TRUE),
    ple0056_other_neg = sum(ple0056 < 0 & !(ple0056 %in% c(-8, -5)), na.rm = TRUE)
  )

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 6
##   ple0055_NA ple0055_not_asked ple0055_other_neg ple0056_NA ple0056_not_asked
##        <int>             <int>             <int>      <int>             <int>
## 1          0             16281            235480          0             16281
## # ℹ 1 more variable: ple0056_other_neg <int>
# Step 2: Impute Values for ple0055 and ple0056 (Capping Lag/Lead to 2 Years)
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    ple0055_flag = ifelse(ple0055 %in% c(-8, -5), 1, 0),  # Flag imputations
    ple0056_flag = ifelse(ple0056 %in% c(-8, -5), 1, 0),
    
    ple0055 = case_when(
      ple0055 %in% c(-8, -5) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0055), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0055), NA_real_)
        coalesce(lag_val, lead_val)
      },
      TRUE ~ ple0055
    ),
    ple0056 = case_when(
      ple0056 %in% c(-8, -5) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0056), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0056), NA_real_)
        coalesce(lag_val, lead_val)
      },
      TRUE ~ ple0056
    ),
    
    imputed_flag = ifelse(ple0055_flag == 1 | ple0056_flag == 1, 1, 0)  # Combined flag
  ) %>%
  ungroup()

# Step 3: Filter Valid Rows and Calculate Hospital Stay
main <- main %>%
  filter(!is.na(ple0055) & !is.na(ple0056)) %>%
  mutate(hospital_stay = ple0055 + ple0056) %>%
  filter(hospital_stay >= 0)

# Step 4: Imputed and Non-Imputed Statistics
# Imputed Rows
imputed_stats <- main %>%
  filter(imputed_flag == 1) %>%
  summarise(
    Mean = mean(hospital_stay, na.rm = TRUE),
    Median = median(hospital_stay, na.rm = TRUE),
    Q1 = quantile(hospital_stay, 0.25, na.rm = TRUE),
    Q3 = quantile(hospital_stay, 0.75, na.rm = TRUE),
    Min = min(hospital_stay, na.rm = TRUE),
    Max = max(hospital_stay, na.rm = TRUE),
    Count = n()
  )

# Non-Imputed Rows
non_imputed_stats <- main %>%
  filter(imputed_flag == 0) %>%
  summarise(
    Mean = mean(hospital_stay, na.rm = TRUE),
    Median = median(hospital_stay, na.rm = TRUE),
    Q1 = quantile(hospital_stay, 0.25, na.rm = TRUE),
    Q3 = quantile(hospital_stay, 0.75, na.rm = TRUE),
    Min = min(hospital_stay, na.rm = TRUE),
    Max = max(hospital_stay, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 7
##    Mean Median    Q1    Q3   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1  8.73      6     4     9     0   110   249
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 7
##    Mean Median    Q1    Q3   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1  13.2      8     4    15     0   351 33900
# Step 5: Recheck Total Imputed Counts
imputation_counts <- main %>%
  summarise(
    Total_Imputed_Rows = sum(imputed_flag),
    Total_Rows = n()
  )
print("Imputation Counts:")
## [1] "Imputation Counts:"
print(imputation_counts)
## # A tibble: 1 × 2
##   Total_Imputed_Rows Total_Rows
##                <dbl>      <int>
## 1                249      34149
# Step 6: Visualizations
# All rows
ggplot(main, aes(x = hospital_stay)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Hospital Stays - All Rows", x = "Days in Hospital", y = "Count")

# Imputed rows
ggplot(main %>% filter(imputed_flag == 1), aes(x = hospital_stay)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(title = "Hospital Stays - Imputed Rows", x = "Days in Hospital", y = "Count")

# Non-Imputed rows
ggplot(main %>% filter(imputed_flag == 0), aes(x = hospital_stay)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(title = "Hospital Stays - Non-Imputed Rows", x = "Days in Hospital", y = "Count")

3.3 Impairement Variables

# Create impaired_stairs and impaired_tiring_tasks
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    ple0004_NA = sum(is.na(ple0004)),
    ple0004_not_asked = sum(ple0004 %in% c(-5, -8), na.rm = TRUE),
    ple0004_other_neg = sum(ple0004 < 0 & !(ple0004 %in% c(-5, -8)), na.rm = TRUE),
    ple0005_NA = sum(is.na(ple0005)),
    ple0005_not_asked = sum(ple0005 %in% c(-5, -8), na.rm = TRUE),
    ple0005_other_neg = sum(ple0005 < 0 & !(ple0005 %in% c(-5, -8)), na.rm = TRUE)
  )

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 6
##   ple0004_NA ple0004_not_asked ple0004_other_neg ple0005_NA ple0005_not_asked
##        <int>             <int>             <int>      <int>             <int>
## 1          0             19692                29          0             19692
## # ℹ 1 more variable: ple0005_other_neg <int>
# Step 2: Impute ple0004 and ple0005 with 2-Year Cap
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    ple0004_imputed = ifelse(ple0004 %in% c(-5, -8), 1, 0),
    ple0005_imputed = ifelse(ple0005 %in% c(-5, -8), 1, 0),
    imputed_flag = ifelse(ple0004_imputed == 1 | ple0005_imputed == 1, 1, 0),

    ple0004 = case_when(
      ple0004 < -2 ~ NA_real_,
      ple0004 %in% c(-5, -8) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0004), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0004), NA_real_)
        coalesce(lag_val, lead_val)
      },
      TRUE ~ ple0004
    ),
    ple0005 = case_when(
      ple0005 < -2 ~ NA_real_,
      ple0005 %in% c(-5, -8) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0005), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0005), NA_real_)
        coalesce(lag_val, lead_val)
      },
      TRUE ~ ple0005
    )
  ) %>%
  ungroup() %>%
  mutate(
    impaired_stairs = ifelse(ple0004 %in% c(1, 2), 1, 0),
    impaired_tiring_tasks = ifelse(ple0005 %in% c(1, 2), 1, 0)
  )

# Step 3: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
  summarise(
    ple0004_NA = sum(is.na(ple0004)),
    ple0005_NA = sum(is.na(ple0005))
  )

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 2
##   ple0004_NA ple0005_NA
##        <int>      <int>
## 1      19692      19692
# Step 4: Descriptive Statistics
# Descriptive statistics for imputed rows
imputed_stats <- main %>%
  filter(imputed_flag == 1) %>%
  summarise(
    Impaired_Stairs_Mean = mean(impaired_stairs, na.rm = TRUE),
    Impaired_Stairs_Median = median(impaired_stairs, na.rm = TRUE),
    Impaired_Stairs_Q1 = quantile(impaired_stairs, 0.25, na.rm = TRUE),
    Impaired_Stairs_Q3 = quantile(impaired_stairs, 0.75, na.rm = TRUE),
    Impaired_Tasks_Mean = mean(impaired_tiring_tasks, na.rm = TRUE),
    Impaired_Tasks_Median = median(impaired_tiring_tasks, na.rm = TRUE),
    Impaired_Tasks_Q1 = quantile(impaired_tiring_tasks, 0.25, na.rm = TRUE),
    Impaired_Tasks_Q3 = quantile(impaired_tiring_tasks, 0.75, na.rm = TRUE),
    Count = n()
  )

# Descriptive statistics for non-imputed rows
non_imputed_stats <- main %>%
  filter(imputed_flag == 0) %>%
  summarise(
    Impaired_Stairs_Mean = mean(impaired_stairs, na.rm = TRUE),
    Impaired_Stairs_Median = median(impaired_stairs, na.rm = TRUE),
    Impaired_Stairs_Q1 = quantile(impaired_stairs, 0.25, na.rm = TRUE),
    Impaired_Stairs_Q3 = quantile(impaired_stairs, 0.75, na.rm = TRUE),
    Impaired_Tasks_Mean = mean(impaired_tiring_tasks, na.rm = TRUE),
    Impaired_Tasks_Median = median(impaired_tiring_tasks, na.rm = TRUE),
    Impaired_Tasks_Q1 = quantile(impaired_tiring_tasks, 0.25, na.rm = TRUE),
    Impaired_Tasks_Q3 = quantile(impaired_tiring_tasks, 0.75, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 9
##   Impaired_Stairs_Mean Impaired_Stairs_Median Impaired_Stairs_Q1
##                  <dbl>                  <dbl>              <dbl>
## 1                    0                      0                  0
## # ℹ 6 more variables: Impaired_Stairs_Q3 <dbl>, Impaired_Tasks_Mean <dbl>,
## #   Impaired_Tasks_Median <dbl>, Impaired_Tasks_Q1 <dbl>,
## #   Impaired_Tasks_Q3 <dbl>, Count <int>
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 9
##   Impaired_Stairs_Mean Impaired_Stairs_Median Impaired_Stairs_Q1
##                  <dbl>                  <dbl>              <dbl>
## 1                0.592                      1                  0
## # ℹ 6 more variables: Impaired_Stairs_Q3 <dbl>, Impaired_Tasks_Mean <dbl>,
## #   Impaired_Tasks_Median <dbl>, Impaired_Tasks_Q1 <dbl>,
## #   Impaired_Tasks_Q3 <dbl>, Count <int>
# Step 5: Visualizations

# Impaired Stairs
# All rows
ggplot(main, aes(x = factor(impaired_stairs))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Impaired Stairs - All Rows",
    x = "Impaired Stairs (1 = Yes, 0 = No)",
    y = "Count"
  ) +
  theme_minimal()

# Imputed rows only
ggplot(main %>% filter(imputed_flag == 1), aes(x = factor(impaired_stairs))) +
  geom_bar(fill = "orange", color = "black") +
  labs(
    title = "Impaired Stairs - Imputed Rows",
    x = "Impaired Stairs (1 = Yes, 0 = No)",
    y = "Count"
  ) +
  theme_minimal()

# Non-imputed rows only
ggplot(main %>% filter(imputed_flag == 0), aes(x = factor(impaired_stairs))) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Impaired Stairs - Non-Imputed Rows",
    x = "Impaired Stairs (1 = Yes, 0 = No)",
    y = "Count"
  ) +
  theme_minimal()

# Impaired Tiring Tasks
# All rows
ggplot(main, aes(x = factor(impaired_tiring_tasks))) +
  geom_bar(fill = "coral", color = "black") +
  labs(
    title = "Impaired Tiring Tasks - All Rows",
    x = "Impaired Tasks (1 = Yes, 0 = No)",
    y = "Count"
  ) +
  theme_minimal()

# Imputed rows only
ggplot(main %>% filter(imputed_flag == 1), aes(x = factor(impaired_tiring_tasks))) +
  geom_bar(fill = "orange", color = "black") +
  labs(
    title = "Impaired Tiring Tasks - Imputed Rows",
    x = "Impaired Tasks (1 = Yes, 0 = No)",
    y = "Count"
  ) +
  theme_minimal()

# Non-imputed rows only
ggplot(main %>% filter(imputed_flag == 0), aes(x = factor(impaired_tiring_tasks))) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Impaired Tiring Tasks - Non-Imputed Rows",
    x = "Impaired Tasks (1 = Yes, 0 = No)",
    y = "Count"
  ) +
  theme_minimal()

3.4. Illnesses: Special Potential Specification because surveyed 2011 and above, every other year

# Create illnesses variable
illness_vars <- c("ple0011", "ple0012", "ple0013", "ple0014", "ple0015", 
                  "ple0016", "ple0017", "ple0018", "ple0019", "ple0020", 
                  "ple0021", "ple0022", "ple0023")

# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(across(
    all_of(illness_vars),
    list(
      NA_count = function(x) sum(is.na(x)),                       # Count NAs
      not_asked = function(x) sum(x %in% c(-8, -5), na.rm = TRUE), # Count -8, -5
      other_neg = function(x) sum(x < 0 & !(x %in% c(-8, -5)), na.rm = TRUE) # Count other negatives
    )
  ))

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 39
##   ple0011_NA_count ple0011_not_asked ple0011_other_neg ple0012_NA_count
##              <int>             <int>             <int>            <int>
## 1                0             25068              7535                0
## # ℹ 35 more variables: ple0012_not_asked <int>, ple0012_other_neg <int>,
## #   ple0013_NA_count <int>, ple0013_not_asked <int>, ple0013_other_neg <int>,
## #   ple0014_NA_count <int>, ple0014_not_asked <int>, ple0014_other_neg <int>,
## #   ple0015_NA_count <int>, ple0015_not_asked <int>, ple0015_other_neg <int>,
## #   ple0016_NA_count <int>, ple0016_not_asked <int>, ple0016_other_neg <int>,
## #   ple0017_NA_count <int>, ple0017_not_asked <int>, ple0017_other_neg <int>,
## #   ple0018_NA_count <int>, ple0018_not_asked <int>, ple0018_other_neg <int>, …
# Step 2: Impute Illness Variables with Max 2-Year Lag/Lead
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(across(
    all_of(illness_vars),
    ~ case_when(
      . %in% c(-8, -5) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(., order_by = syear), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(., order_by = syear), NA_real_)
        if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
      },
      . == -2 ~ 0,  # Set -2 to 0
      TRUE ~ .  # Keep valid values
    ),
    .names = "{.col}_imputed"
  )) %>%
  ungroup() %>%
  mutate(across(all_of(illness_vars), ~ if_else(. < 0, NA_real_, .))) %>%
  mutate(
    illnesses = rowSums(across(all_of(illness_vars)), na.rm = TRUE),
    imputed_flag = if_else(rowSums(across(ends_with("_imputed"), ~ . %in% c(-8, -5))) > 0, 1, 0)
  ) %>%
  mutate(illnesses = round(illnesses))

# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
  summarise(across(
    all_of(illness_vars),
    ~ sum(is.na(.)), .names = "{.col}_NA_after"
  ))

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 13
##   ple0011_NA_after ple0012_NA_after ple0013_NA_after ple0014_NA_after
##              <int>            <int>            <int>            <int>
## 1            32603            32704            33237            31912
## # ℹ 9 more variables: ple0015_NA_after <int>, ple0016_NA_after <int>,
## #   ple0017_NA_after <int>, ple0018_NA_after <int>, ple0019_NA_after <int>,
## #   ple0020_NA_after <int>, ple0021_NA_after <int>, ple0022_NA_after <int>,
## #   ple0023_NA_after <int>
# Step 4: Descriptive Statistics for Imputed and Non-Imputed Rows
# Imputed Rows
imputed_stats <- main %>%
  filter(imputed_flag == 1) %>%
  summarise(
    Mean_Illnesses = mean(illnesses, na.rm = TRUE),
    Median_Illnesses = median(illnesses, na.rm = TRUE),
    Q1 = quantile(illnesses, 0.25, na.rm = TRUE),
    Q3 = quantile(illnesses, 0.75, na.rm = TRUE),
    Min = min(illnesses, na.rm = TRUE),
    Max = max(illnesses, na.rm = TRUE),
    Count = n()
  )

# Non-Imputed Rows
non_imputed_stats <- main %>%
  filter(imputed_flag == 0) %>%
  summarise(
    Mean_Illnesses = mean(illnesses, na.rm = TRUE),
    Median_Illnesses = median(illnesses, na.rm = TRUE),
    Q1 = quantile(illnesses, 0.25, na.rm = TRUE),
    Q3 = quantile(illnesses, 0.75, na.rm = TRUE),
    Min = min(illnesses, na.rm = TRUE),
    Max = max(illnesses, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 7
##   Mean_Illnesses Median_Illnesses    Q1    Q3   Min   Max Count
##            <dbl>            <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1          0.106                0     0     0     0     6 10190
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 7
##   Mean_Illnesses Median_Illnesses    Q1    Q3   Min   Max Count
##            <dbl>            <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1          0.862                0     0     1     0    11 23959
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = illnesses)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Illness Counts - All Rows",
    x = "Number of Illnesses",
    y = "Count"
  ) +
  theme_minimal()

# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = illnesses)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(
    title = "Distribution of Illness Counts - Imputed Rows",
    x = "Number of Illnesses",
    y = "Count"
  ) +
  theme_minimal()

# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = illnesses)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(
    title = "Distribution of Illness Counts - Non-Imputed Rows",
    x = "Number of Illnesses",
    y = "Count"
  ) +
  theme_minimal()

3.2 Predictors

3.2.1 Life Satisfaction and Health Satisfaction

# Create life_satisfaction and health_satisfaction variables
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    plh0182_NA = sum(is.na(plh0182)),
    plh0182_not_asked = sum(plh0182 %in% c(-8, -5), na.rm = TRUE),
    plh0182_other_neg = sum(plh0182 < 0 & !(plh0182 %in% c(-8, -5)), na.rm = TRUE),
    plh0171_NA = sum(is.na(plh0171)),
    plh0171_not_asked = sum(plh0171 %in% c(-8, -5), na.rm = TRUE),
    plh0171_other_neg = sum(plh0171 < 0 & !(plh0171 %in% c(-8, -5)), na.rm = TRUE)
  )

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 6
##   plh0182_NA plh0182_not_asked plh0182_other_neg plh0171_NA plh0171_not_asked
##        <int>             <int>             <int>      <int>             <int>
## 1          0               296                74          0               394
## # ℹ 1 more variable: plh0171_other_neg <int>
# Step 2: Impute Values for plh0182 and plh0171 with Max 2-Year Lag/Lead
main <- main %>%
  arrange(pid, syear) %>%  # Ensure proper sorting
  group_by(pid) %>%
  mutate(
    # Flag rows that are imputed
    imputed_flag_life = if_else(plh0182 %in% c(-5, -8) | is.na(plh0182), 1, 0),
    imputed_flag_health = if_else(plh0171 %in% c(-5, -8) | is.na(plh0171), 1, 0),

    # Replace -5 and -8 with NA
    plh0182 = case_when(plh0182 %in% c(-5, -8) ~ NA_real_, TRUE ~ plh0182),
    plh0171 = case_when(plh0171 %in% c(-5, -8) ~ NA_real_, TRUE ~ plh0171),

    # Impute plh0182 (life satisfaction)
    life_satisfaction = if_else(
      is.na(plh0182),
      coalesce(
        if_else(syear - lag(syear) <= 2 & !is.na(lag(plh0182)), lag(plh0182), NA_real_),
        if_else(lead(syear) - syear <= 2 & !is.na(lead(plh0182)), lead(plh0182), NA_real_)
      ),
      plh0182
    ),

    # Impute plh0171 (health satisfaction)
    health_satisfaction = if_else(
      is.na(plh0171),
      coalesce(
        if_else(syear - lag(syear) <= 2 & !is.na(lag(plh0171)), lag(plh0171), NA_real_),
        if_else(lead(syear) - syear <= 2 & !is.na(lead(plh0171)), lead(plh0171), NA_real_)
      ),
      plh0171
    )
  ) %>%
  ungroup()

# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
  summarise(
    life_satisfaction_NA = sum(is.na(life_satisfaction)),
    health_satisfaction_NA = sum(is.na(health_satisfaction))
  )

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 2
##   life_satisfaction_NA health_satisfaction_NA
##                  <int>                  <int>
## 1                  185                    185
# Step 4: Descriptive Statistics
# Imputed Rows
imputed_stats <- main %>%
  filter(imputed_flag_life == 1 | imputed_flag_health == 1) %>%
  summarise(
    Life_Satisfaction_Mean = mean(life_satisfaction, na.rm = TRUE),
    Life_Satisfaction_Median = median(life_satisfaction, na.rm = TRUE),
    Health_Satisfaction_Mean = mean(health_satisfaction, na.rm = TRUE),
    Health_Satisfaction_Median = median(health_satisfaction, na.rm = TRUE),
    Count = n()
  )

# Non-Imputed Rows
non_imputed_stats <- main %>%
  filter(imputed_flag_life == 0 & imputed_flag_health == 0) %>%
  summarise(
    Life_Satisfaction_Mean = mean(life_satisfaction, na.rm = TRUE),
    Life_Satisfaction_Median = median(life_satisfaction, na.rm = TRUE),
    Health_Satisfaction_Mean = mean(health_satisfaction, na.rm = TRUE),
    Health_Satisfaction_Median = median(health_satisfaction, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 5
##   Life_Satisfaction_Mean Life_Satisfaction_Median Health_Satisfaction_Mean
##                    <dbl>                    <dbl>                    <dbl>
## 1                   7.20                        8                     6.08
## # ℹ 2 more variables: Health_Satisfaction_Median <dbl>, Count <int>
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 5
##   Life_Satisfaction_Mean Life_Satisfaction_Median Health_Satisfaction_Mean
##                    <dbl>                    <dbl>                    <dbl>
## 1                   6.77                        7                     5.55
## # ℹ 2 more variables: Health_Satisfaction_Median <dbl>, Count <int>
# Step 5: Visualizations
# All Rows: Life Satisfaction
ggplot(main, aes(x = factor(life_satisfaction))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Life Satisfaction - All Rows",
    x = "Life Satisfaction (0-10)",
    y = "Count"
  ) +
  theme_minimal()

# Imputed Rows Only
ggplot(main %>% filter(imputed_flag_life == 1), aes(x = factor(life_satisfaction))) +
  geom_bar(fill = "orange", color = "black") +
  labs(
    title = "Life Satisfaction - Imputed Rows",
    x = "Life Satisfaction (0-10)",
    y = "Count"
  ) +
  theme_minimal()

# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag_life == 0), aes(x = factor(life_satisfaction))) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Life Satisfaction - Non-Imputed Rows",
    x = "Life Satisfaction (0-10)",
    y = "Count"
  ) +
  theme_minimal()

# All Rows: Health Satisfaction
ggplot(main, aes(x = factor(health_satisfaction))) +
  geom_bar(fill = "coral", color = "black") +
  labs(
    title = "Health Satisfaction - All Rows",
    x = "Health Satisfaction (0-10)",
    y = "Count"
  ) +
  theme_minimal()

# Imputed Rows Only
ggplot(main %>% filter(imputed_flag_health == 1), aes(x = factor(health_satisfaction))) +
  geom_bar(fill = "orange", color = "black") +
  labs(
    title = "Health Satisfaction - Imputed Rows",
    x = "Health Satisfaction (0-10)",
    y = "Count"
  ) +
  theme_minimal()

# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag_health == 0), aes(x = factor(health_satisfaction))) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Health Satisfaction - Non-Imputed Rows",
    x = "Health Satisfaction (0-10)",
    y = "Count"
  ) +
  theme_minimal()

3.2.2 BMI

# Process height and weight
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    height_NA = sum(is.na(ple0006)),
    weight_NA = sum(is.na(ple0007)),
    height_invalid = sum(ple0006 > 245 | ple0006 <= 0, na.rm = TRUE),
    weight_invalid = sum(ple0007 <= 0, na.rm = TRUE)
  )

print("Breakdown of missing and invalid values before imputation:")
## [1] "Breakdown of missing and invalid values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 4
##   height_NA weight_NA height_invalid weight_invalid
##       <int>     <int>          <int>          <int>
## 1         0         0          19951          20062
# Step 2: Process and Impute Height with Lag/Lead (<= 2 years)
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    # Track imputation flag for height
    height_imputed = if_else(ple0006 > 245 | ple0006 <= 0 | is.na(ple0006), 1, 0),
    # Handle valid height and imputation
    height = case_when(
      ple0006 > 245 ~ NA_real_,
      ple0006 > 0 ~ ple0006,
      TRUE ~ NA_real_
    ),
    height = case_when(
      is.na(height) & syear - lag(syear) <= 2 ~ lag(height),
      is.na(height) & lead(syear) - syear <= 2 ~ lead(height),
      TRUE ~ height
    )
  ) %>%
  ungroup()

# Step 3: Process and Impute Weight with Lag/Lead (<= 2 years)
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    # Track imputation flag for weight
    weight_imputed = if_else(ple0007 <= 0 | is.na(ple0007), 1, 0),
    # Handle valid weight and imputation
    weight = case_when(
      ple0007 > 0 ~ ple0007,
      TRUE ~ NA_real_
    ),
    weight = case_when(
      is.na(weight) & syear - lag(syear) <= 2 ~ lag(weight),
      is.na(weight) & lead(syear) - syear <= 2 ~ lead(weight),
      TRUE ~ weight
    )
  ) %>%
  ungroup()

# Step 4: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
  summarise(
    height_NA = sum(is.na(height)),
    weight_NA = sum(is.na(weight)),
    height_imputed = sum(height_imputed),
    weight_imputed = sum(weight_imputed)
  )

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 4
##   height_NA weight_NA height_imputed weight_imputed
##       <int>     <int>          <dbl>          <dbl>
## 1     13743     13880          19951          20062
# Step 5: Calculate BMI and Create Categorical Variable
main <- main %>%
  mutate(
    BMI = weight / ((height / 100) ^ 2),  # Calculate BMI
    bmi_categorical = case_when(
      BMI < 18.5 ~ "Underweight",
      BMI >= 18.5 & BMI < 25 ~ "Normal weight",
      BMI >= 25 & BMI < 30 ~ "Overweight",
      BMI >= 30 ~ "Obese",
      TRUE ~ NA_character_
    ),
    bmi_categorical = factor(
      bmi_categorical,
      levels = c("Underweight", "Normal weight", "Overweight", "Obese")
    )
  )

# Step 6: Descriptive Statistics for Imputed and Non-Imputed Rows
# Descriptive stats for all rows
all_stats <- main %>%
  summarise(
    Mean = mean(BMI, na.rm = TRUE),
    Median = median(BMI, na.rm = TRUE),
    Q1 = quantile(BMI, 0.25, na.rm = TRUE),
    Q3 = quantile(BMI, 0.75, na.rm = TRUE),
    Min = min(BMI, na.rm = TRUE),
    Max = max(BMI, na.rm = TRUE),
    Count = n()
  )

# Descriptive stats for imputed rows
imputed_stats <- main %>%
  filter(height_imputed == 1 | weight_imputed == 1) %>%
  summarise(
    Mean = mean(BMI, na.rm = TRUE),
    Median = median(BMI, na.rm = TRUE),
    Q1 = quantile(BMI, 0.25, na.rm = TRUE),
    Q3 = quantile(BMI, 0.75, na.rm = TRUE),
    Min = min(BMI, na.rm = TRUE),
    Max = max(BMI, na.rm = TRUE),
    Count = n()
  )

# Descriptive stats for non-imputed rows
non_imputed_stats <- main %>%
  filter(height_imputed == 0 & weight_imputed == 0) %>%
  summarise(
    Mean = mean(BMI, na.rm = TRUE),
    Median = median(BMI, na.rm = TRUE),
    Q1 = quantile(BMI, 0.25, na.rm = TRUE),
    Q3 = quantile(BMI, 0.75, na.rm = TRUE),
    Min = min(BMI, na.rm = TRUE),
    Max = max(BMI, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for All Rows:")
## [1] "Descriptive Statistics for All Rows:"
print(all_stats)
## # A tibble: 1 × 7
##    Mean Median    Q1    Q3   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1  26.9   26.2  23.3  29.7  11.0  95.2 34149
print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 7
##    Mean Median    Q1    Q3   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1  27.2   26.4  23.5  29.8  12.6  95.2 20070
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 7
##    Mean Median    Q1    Q3   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1  26.8   26.0  23.1  29.4  11.0  95.2 14079
# Step 7: Visualizations
# Histogram for all rows
ggplot(main, aes(x = BMI)) +
  geom_histogram(binwidth = 1, fill = "purple", color = "black", alpha = 0.7) +
  labs(
    title = "BMI Distribution - All Rows",
    x = "BMI",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 13891 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram for imputed rows
ggplot(main %>% filter(height_imputed == 1 | weight_imputed == 1), aes(x = BMI)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black", alpha = 0.7) +
  labs(
    title = "BMI Distribution - Imputed Rows",
    x = "BMI",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 13891 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram for non-imputed rows
ggplot(main %>% filter(height_imputed == 0 & weight_imputed == 0), aes(x = BMI)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black", alpha = 0.7) +
  labs(
    title = "BMI Distribution - Non-Imputed Rows",
    x = "BMI",
    y = "Count"
  ) +
  theme_minimal()

3.2.3 Smoking and Number of Cigarettes

# Create `smoking_dummy`
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    ple0081_h_NA = sum(is.na(ple0081_h)),
    ple0081_h_invalid = sum(ple0081_h %in% c(-5, -8), na.rm = TRUE),
    ple0086_v4_NA = sum(is.na(ple0086_v4)),
    ple0086_v4_invalid = sum(ple0086_v4 %in% c(-5, -8), na.rm = TRUE)
  )

print("Breakdown of missing and invalid values before imputation:")
## [1] "Breakdown of missing and invalid values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 4
##   ple0081_h_NA ple0081_h_invalid ple0086_v4_NA ple0086_v4_invalid
##          <int>             <int>         <int>              <int>
## 1            0             19591             0              21578
# Step 2: Impute Smoking Status (ple0081_h) with Lag and Lead (Max 2 Years)
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    smoking_imputed = if_else(ple0081_h %in% c(-5, -8), 1, 0),
    ple0081_h = case_when(
      ple0081_h %in% c(-5, -8) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0081_h), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0081_h), NA_real_)
        if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
      },
      ple0081_h == -2 ~ 2,  # Treat -2 as "no" (2)
      ple0081_h > 0 ~ ple0081_h,
      TRUE ~ NA_real_
    ),
    smoking_dummy = if_else(ple0081_h == 1, 1, 0)  # Create smoking dummy (1 = Smoker, 0 = Non-Smoker)
  ) %>%
  ungroup()

# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    ple0086_v4_NA = sum(is.na(ple0086_v4)),
    ple0086_v4_invalid = sum(ple0086_v4 %in% c(-5, -8), na.rm = TRUE),
    ple0086_v4_negative = sum(ple0086_v4 < 0, na.rm = TRUE)
  )

print("Breakdown of invalid values before imputation:")
## [1] "Breakdown of invalid values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 3
##   ple0086_v4_NA ple0086_v4_invalid ple0086_v4_negative
##           <int>              <int>               <int>
## 1             0              21578               34034
# Step 2: Impute Number of Cigarettes with Lag/Lead (Max 2 Years)
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    cigarettes_imputed = if_else(ple0086_v4 %in% c(-5, -8), 1, 0),  # Flag imputed rows
    ple0086_v4 = case_when(
      ple0086_v4 %in% c(-5, -8) ~ {
        # Impute using lag and lead within 2 years
        lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0086_v4), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0086_v4), NA_real_)
        valid_val <- if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
        if_else(valid_val >= 0, valid_val, NA_real_)  # Ensure valid positive values only
      },
      ple0086_v4 == -2 ~ 0,  # Treat -2 as 0
      ple0086_v4 >= 0 ~ ple0086_v4,  # Keep valid positive values
      TRUE ~ NA_real_  # Set other invalid cases to NA
    ),
    nb_of_cigarettes = ple0086_v4
  ) %>%
  ungroup()

# Step 3: Remove Remaining Invalid or Negative Values
main <- main %>%
  mutate(nb_of_cigarettes = if_else(nb_of_cigarettes < 0, NA_real_, nb_of_cigarettes)) %>%
  filter(!is.na(nb_of_cigarettes))


# Step 4: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
  summarise(
    ple0081_h_NA = sum(is.na(ple0081_h)),
    ple0086_v4_NA = sum(is.na(ple0086_v4))
  )

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 2
##   ple0081_h_NA ple0086_v4_NA
##          <int>         <int>
## 1            9             0
# Step 5: Descriptive Statistics
# Smoking Dummy
smoking_stats_all <- main %>%
  summarise(
    Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
    Count = n()
  )

smoking_stats_imputed <- main %>%
  filter(smoking_imputed == 1) %>%
  summarise(
    Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
    Count = n()
  )

smoking_stats_non_imputed <- main %>%
  filter(smoking_imputed == 0) %>%
  summarise(
    Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Smoking (All Rows):")
## [1] "Descriptive Statistics for Smoking (All Rows):"
print(smoking_stats_all)
## # A tibble: 1 × 2
##   Mean_Smoking Count
##          <dbl> <int>
## 1        0.257 12633
print("Descriptive Statistics for Smoking (Imputed Rows):")
## [1] "Descriptive Statistics for Smoking (Imputed Rows):"
print(smoking_stats_imputed)
## # A tibble: 1 × 2
##   Mean_Smoking Count
##          <dbl> <int>
## 1        0.887    62
print("Descriptive Statistics for Smoking (Non-Imputed Rows):")
## [1] "Descriptive Statistics for Smoking (Non-Imputed Rows):"
print(smoking_stats_non_imputed)
## # A tibble: 1 × 2
##   Mean_Smoking Count
##          <dbl> <int>
## 1        0.254 12571
# Number of Cigarettes
cigarettes_stats_all <- main %>%
  summarise(
    Mean_Cigarettes = mean(nb_of_cigarettes, na.rm = TRUE),
    Median = median(nb_of_cigarettes, na.rm = TRUE),
    Q1 = quantile(nb_of_cigarettes, 0.25, na.rm = TRUE),
    Q3 = quantile(nb_of_cigarettes, 0.75, na.rm = TRUE),
    Min = min(nb_of_cigarettes, na.rm = TRUE),
    Max = max(nb_of_cigarettes, na.rm = TRUE),
    Count = n()
  )

cigarettes_stats_imputed <- main %>%
  filter(cigarettes_imputed == 1) %>%
  summarise(
    Mean_Cigarettes = mean(nb_of_cigarettes, na.rm = TRUE),
    Median = median(nb_of_cigarettes, na.rm = TRUE),
    Q1 = quantile(nb_of_cigarettes, 0.25, na.rm = TRUE),
    Q3 = quantile(nb_of_cigarettes, 0.75, na.rm = TRUE),
    Min = min(nb_of_cigarettes, na.rm = TRUE),
    Max = max(nb_of_cigarettes, na.rm = TRUE),
    Count = n()
  )

cigarettes_stats_non_imputed <- main %>%
  filter(cigarettes_imputed == 0) %>%
  summarise(
    Mean_Cigarettes = mean(nb_of_cigarettes, na.rm = TRUE),
    Median = median(nb_of_cigarettes, na.rm = TRUE),
    Q1 = quantile(nb_of_cigarettes, 0.25, na.rm = TRUE),
    Q3 = quantile(nb_of_cigarettes, 0.75, na.rm = TRUE),
    Min = min(nb_of_cigarettes, na.rm = TRUE),
    Max = max(nb_of_cigarettes, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Number of Cigarettes (All Rows):")
## [1] "Descriptive Statistics for Number of Cigarettes (All Rows):"
print(cigarettes_stats_all)
## # A tibble: 1 × 7
##   Mean_Cigarettes Median    Q1    Q3 Min       Max       Count
##             <dbl>  <dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1           0.106      0     0     0 0         30        12633
print("Descriptive Statistics for Number of Cigarettes (Imputed Rows):")
## [1] "Descriptive Statistics for Number of Cigarettes (Imputed Rows):"
print(cigarettes_stats_imputed)
## # A tibble: 1 × 7
##   Mean_Cigarettes Median    Q1    Q3 Min       Max       Count
##             <dbl>  <dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1            7.30      5     3    10 0         30           64
print("Descriptive Statistics for Number of Cigarettes (Non-Imputed Rows):")
## [1] "Descriptive Statistics for Number of Cigarettes (Non-Imputed Rows):"
print(cigarettes_stats_non_imputed)
## # A tibble: 1 × 7
##   Mean_Cigarettes Median    Q1    Q3 Min       Max       Count
##             <dbl>  <dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1          0.0695      0     0     0 0         30        12569
# Step 6: Visualizations
# Smoking Dummy
ggplot(main, aes(x = factor(smoking_dummy))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(title = "Smoking Status - All Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
  theme_minimal()

ggplot(main %>% filter(smoking_imputed == 1), aes(x = factor(smoking_dummy))) +
  geom_bar(fill = "orange", color = "black") +
  labs(title = "Smoking Status - Imputed Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
  theme_minimal()

ggplot(main %>% filter(smoking_imputed == 0), aes(x = factor(smoking_dummy))) +
  geom_bar(fill = "green", color = "black") +
  labs(title = "Smoking Status - Non-Imputed Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
  theme_minimal()

# Number of Cigarettes
ggplot(main, aes(x = nb_of_cigarettes)) +
  geom_histogram(binwidth = 1, fill = "purple", color = "black") +
  labs(title = "Number of Cigarettes - All Rows", x = "Cigarettes Per Day", y = "Count") +
  theme_minimal()

ggplot(main %>% filter(cigarettes_imputed == 1), aes(x = nb_of_cigarettes)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(title = "Number of Cigarettes - Imputed Rows", x = "Cigarettes Per Day", y = "Count") +
  theme_minimal()

ggplot(main %>% filter(cigarettes_imputed == 0), aes(x = nb_of_cigarettes)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(title = "Number of Cigarettes - Non-Imputed Rows", x = "Cigarettes Per Day", y = "Count") +
  theme_minimal()

3.2.4 Diet: Special Potential Specification because surveyed every second year from 2004-2014

# Create diet_healthy and diet_dummy
main <- main %>%
  mutate(
    ple0095 = ifelse(ple0095 < 0, NA_real_, ple0095),
    diet_categorical = factor(
      ple0095, levels = c(1, 2, 3, 4),
      labels = c("very healthy", "somewhat healthy", "somewhat unhealthy", "very unhealthy")
    ),
    diet_dummy = case_when(
      ple0095 %in% c(1, 2) ~ 1,
      ple0095 %in% c(3, 4) ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  select(-ple0095)
#Visusalition: Bar Plot
ggplot(main, aes(x = diet_categorical)) +
  geom_bar(fill = "darkgreen", color = "black") +
  labs(
    title = "Distribution of Diet Categories",
    x = "Diet Category",
    y = "Count"
  ) +
  theme_minimal()

4. Control Variables

4.1 Worries about job security, financial situation, and economic situation

# Create variables for worries about job security, financial situation, and economic situation
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    plh0042_NA = sum(is.na(plh0042)),
    plh0042_invalid = sum(plh0042 %in% c(-5, -8), na.rm = TRUE),
    plh0033_NA = sum(is.na(plh0033)),
    plh0033_invalid = sum(plh0033 %in% c(-5, -8), na.rm = TRUE),
    plh0032_NA = sum(is.na(plh0032)),
    plh0032_invalid = sum(plh0032 %in% c(-5, -8), na.rm = TRUE)
  )

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 6
##   plh0042_NA plh0042_invalid plh0033_NA plh0033_invalid plh0032_NA
##        <int>           <int>      <int>           <int>      <int>
## 1          0               0          0               0          0
## # ℹ 1 more variable: plh0032_invalid <int>
# Step 2: Lag/Lead Imputation (Max 2 Years)
main <- main %>%
  arrange(pid, syear) %>%  # Ensure data is ordered by pid and syear
  group_by(pid) %>%
  mutate(
    # Impute -5 or -8 for `plh0042` with max 2-year lag/lead
    worries_job_imputed = if_else(plh0042 %in% c(-5, -8), 1, 0),
    plh0042 = case_when(
      plh0042 %in% c(-5, -8) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0042), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(plh0042), NA_real_)
        if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
      },
      TRUE ~ plh0042
    ),

    # Repeat for `plh0033`
    worries_financial_imputed = if_else(plh0033 %in% c(-5, -8), 1, 0),
    plh0033 = case_when(
      plh0033 %in% c(-5, -8) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0033), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(plh0033), NA_real_)
        if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
      },
      TRUE ~ plh0033
    ),

    # Repeat for `plh0032`
    worries_economic_imputed = if_else(plh0032 %in% c(-5, -8), 1, 0),
    plh0032 = case_when(
      plh0032 %in% c(-5, -8) ~ {
        lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0032), NA_real_)
        lead_val <- if_else(lead(syear) - syear <= 2, lead(plh0032), NA_real_)
        if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
      },
      TRUE ~ plh0032
    )
  ) %>%
  ungroup() 

# Step 3: Create Categorical and Dummy Variables
main <- main %>%
  mutate(
    worr_job_categorical = factor(case_when(plh0042 == -2 ~ 3, TRUE ~ plh0042),
                                  levels = c(3, 2, 1), labels = c("no worries", "worries a little", "worries a lot")),
    worr_financial_categorical = factor(case_when(plh0033 == -2 ~ 3, TRUE ~ plh0033),
                                        levels = c(3, 2, 1), labels = c("no worries", "worries a little", "worries a lot")),
    worr_economic_categorical = factor(case_when(plh0032 == -2 ~ 3, TRUE ~ plh0032),
                                       levels = c(3, 2, 1), labels = c("no worries", "worries a little", "worries a lot")),
    worr_job_dummy = ifelse(worr_job_categorical %in% c("worries a lot", "worries a little"), 1, 0),
    worr_financial_dummy = ifelse(worr_financial_categorical %in% c("worries a lot", "worries a little"), 1, 0),
    worr_economic_dummy = ifelse(worr_economic_categorical %in% c("worries a lot", "worries a little"), 1, 0)
  )

Continuation of code

# Define descriptive_stats function
descriptive_stats <- function(data, condition) {
  data %>%
    filter(!!rlang::enquo(condition)) %>%  # Use enquo for capturing condition
    summarise(
      Job_Worries_Mean = mean(as.numeric(worr_job_dummy), na.rm = TRUE),
      Financial_Worries_Mean = mean(as.numeric(worr_financial_dummy), na.rm = TRUE),
      Economic_Worries_Mean = mean(as.numeric(worr_economic_dummy), na.rm = TRUE),
      Count = n()
    )
}
# Calculate statistics for all rows, imputed rows, and non-imputed rows
all_stats <- descriptive_stats(main, TRUE)
imputed_stats <- descriptive_stats(main, worries_job_imputed == 1 | worries_financial_imputed == 1 | worries_economic_imputed == 1)
non_imputed_stats <- descriptive_stats(main, worries_job_imputed == 0 & worries_financial_imputed == 0 & worries_economic_imputed == 0)

print("Descriptive Statistics for All Rows:")
## [1] "Descriptive Statistics for All Rows:"
print(all_stats)
## # A tibble: 1 × 4
##   Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
##              <dbl>                  <dbl>                 <dbl> <int>
## 1            0.188                  0.683                 0.835 12633
print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 4
##   Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
##              <dbl>                  <dbl>                 <dbl> <int>
## 1              NaN                    NaN                   NaN     0
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 4
##   Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
##              <dbl>                  <dbl>                 <dbl> <int>
## 1            0.188                  0.683                 0.835 12633
# Step 5: Visualizations
# Drop rows with NAs in worry-related columns
main <- main %>%
  drop_na(worr_job_categorical, worr_financial_categorical, worr_economic_categorical)

# Distribution of Worry Levels - All Rows
main %>%
  select(worr_job_categorical, worr_financial_categorical, worr_economic_categorical) %>%
  pivot_longer(cols = everything(), names_to = "Worry_Type", values_to = "Worry_Level") %>%
  ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Worry Levels - All Rows",
    x = "Worry Level",
    y = "Count",
    fill = "Worry Type"
  ) +
  theme_minimal()

# Imputed Rows
main %>%
  filter(worries_job_imputed == 1 | worries_financial_imputed == 1 | worries_economic_imputed == 1) %>%
  pivot_longer(cols = c(worr_job_categorical, worr_financial_categorical, worr_economic_categorical), 
               names_to = "Worry_Type", values_to = "Worry_Level") %>%
  ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Worry Levels - Imputed Rows",
    x = "Worry Level",
    y = "Count",
    fill = "Worry Type"
  ) +
  theme_minimal()

# Non-Imputed Rows
main %>%
  filter(worries_job_imputed == 0 & worries_financial_imputed == 0 & worries_economic_imputed == 0) %>%
  pivot_longer(cols = c(worr_job_categorical, worr_financial_categorical, worr_economic_categorical), 
               names_to = "Worry_Type", values_to = "Worry_Level") %>%
  ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Worry Levels - Non-Imputed Rows",
    x = "Worry Level",
    y = "Count",
    fill = "Worry Type"
  ) +
  theme_minimal()

4.2 Tenure and Employment Status

# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    NA_count = sum(is.na(pgerwzeit)),
    not_asked = sum(pgerwzeit %in% c(-8, -5), na.rm = TRUE),
    other_neg = sum(pgerwzeit < 0 & !(pgerwzeit %in% c(-8, -5)), na.rm = TRUE)
  )

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 3
##   NA_count not_asked other_neg
##      <int>     <int>     <int>
## 1        0         0      7503
# Step 2: Impute Tenure with Year Difference Logic and Max 2-Year Lag/Lead
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    tenure = ifelse(pgerwzeit %in% c(-5, -8), NA_real_, pgerwzeit),
    tenure = {
      temp <- tenure
      years <- syear
      for (i in seq_len(n())) {
        if (is.na(temp[i])) {
          lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
          lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
          
          if (length(lag_idx) > 0 && length(lead_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            lead_diff <- years[lead_idx[1]] - years[i]
            nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
            temp[i] <- temp[nearest] + ifelse(nearest %in% lag_idx, lag_diff, -lead_diff)
          } else if (length(lag_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            temp[i] <- temp[lag_idx[length(lag_idx)]] + lag_diff
          } else if (length(lead_idx) > 0) {
            lead_diff <- years[lead_idx[1]] - years[i]
            temp[i] <- temp[lead_idx[1]] - lead_diff
          }
        }
      }
      temp
    },
    tenure = ifelse(is.na(tenure) & pgerwzeit == -2, 0, tenure)
  ) %>%
  ungroup() %>%
  mutate(
    tenure = ifelse(tenure < 0, NA_real_, tenure),  # Remove any remaining negatives
    imputed_flag = if_else(is.na(pgerwzeit) | pgerwzeit %in% c(-8, -5), 1, 0)  # Track imputed rows
  )

# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
  summarise(
    NA_count_after = sum(is.na(tenure)),
    imputed_count = sum(imputed_flag)
  )

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 2
##   NA_count_after imputed_count
##            <int>         <dbl>
## 1           7503             0
# Step 4: Descriptive Statistics for Imputed and Non-Imputed Rows
# Imputed Rows
imputed_stats <- main %>%
  filter(imputed_flag == 1) %>%
  summarise(
    Mean_Tenure = mean(tenure, na.rm = TRUE),
    Median_Tenure = median(tenure, na.rm = TRUE),
    Q1 = quantile(tenure, 0.25, na.rm = TRUE),
    Q3 = quantile(tenure, 0.75, na.rm = TRUE),
    Min = min(tenure, na.rm = TRUE),
    Max = max(tenure, na.rm = TRUE),
    Count = n()
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(tenure, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-Imputed Rows
non_imputed_stats <- main %>%
  filter(imputed_flag == 0) %>%
  summarise(
    Mean_Tenure = mean(tenure, na.rm = TRUE),
    Median_Tenure = median(tenure, na.rm = TRUE),
    Q1 = quantile(tenure, 0.25, na.rm = TRUE),
    Q3 = quantile(tenure, 0.75, na.rm = TRUE),
    Min = min(tenure, na.rm = TRUE),
    Max = max(tenure, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 7
##   Mean_Tenure Median_Tenure    Q1    Q3   Min   Max Count
##         <dbl>         <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1         NaN            NA    NA    NA   Inf  -Inf     0
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 7
##   Mean_Tenure Median_Tenure    Q1    Q3   Min   Max Count
##         <dbl>         <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1        11.6          7.92  2.42  18.5     0  62.1 12370
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = tenure)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Tenure - All Rows",
    x = "Tenure (Years)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 7503 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = tenure)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(
    title = "Distribution of Tenure - Imputed Rows",
    x = "Tenure (Years)",
    y = "Count"
  ) +
  theme_minimal()

# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = tenure)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(
    title = "Distribution of Tenure - Non-Imputed Rows",
    x = "Tenure (Years)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 7503 rows containing non-finite outside the scale range
## (`stat_bin()`).

4.3 Net Income and Working time per week

# Transform `pglabnet` into `net_income`
# Transform `pglabnet` into `net_income`
main <- main %>%
  arrange(pid, syear) %>%  # Ensure data is ordered by pid and syear
  group_by(pid) %>%
  mutate(
    # Replace invalid values with NA
    net_income = ifelse(pglabnet %in% c(-5, -8, 99999), NA_real_, pglabnet),
    # Iterative imputation using lag and lead with a max 2-year difference
    net_income = {
      temp <- net_income
      years <- syear
      for (i in seq_len(n())) {
        if (is.na(temp[i])) {
          # Find valid lag and lead indices within a 2-year difference
          lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
          lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
          
          if (length(lag_idx) > 0 && length(lead_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            lead_diff <- years[lead_idx[1]] - years[i]
            nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
            temp_value <- temp[nearest]
            temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value)  # Set NA if negative
          } else if (length(lag_idx) > 0) {
            temp_value <- temp[lag_idx[length(lag_idx)]]
            temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value)  # Set NA if negative
          } else if (length(lead_idx) > 0) {
            temp_value <- temp[lead_idx[1]]
            temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value)  # Set NA if negative
          }
        }
      }
      temp
    },
    # Set -2 to NA
    net_income = ifelse(net_income < 0, NA_real_, net_income)  # Ensure no negative values
  ) %>%
  ungroup() %>%
  mutate(
    # Remove remaining NA values in ALL rows
    net_income = ifelse(is.na(net_income), NA_real_, net_income),
    imputed_flag_net_income = ifelse(is.na(pglabnet) | pglabnet %in% c(-5, -8, 99999), 1, 0)
  ) %>%
  filter(!is.na(net_income))  # Remove rows with NA net_income

# Transform `pgtatzeit` into `work_time_weekly`
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    # Replace invalid values with NA
    work_time_weekly = ifelse(pgtatzeit %in% c(-5, -8), NA_real_, pgtatzeit),
    # Iterative imputation using lag and lead with a max 2-year difference
    work_time_weekly = {
      temp <- work_time_weekly
      years <- syear
      for (i in seq_len(n())) {
        if (is.na(temp[i])) {
          # Find valid lag and lead indices within a 2-year difference
          lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
          lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
          
          if (length(lag_idx) > 0 && length(lead_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            lead_diff <- years[lead_idx[1]] - years[i]
            nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
            temp_value <- temp[nearest]
            temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value)  # Set NA if negative
          } else if (length(lag_idx) > 0) {
            temp_value <- temp[lag_idx[length(lag_idx)]]
            temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value)  # Set NA if negative
          } else if (length(lead_idx) > 0) {
            temp_value <- temp[lead_idx[1]]
            temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value)  # Set NA if negative
          }
        }
      }
      temp
    },
    # Set negative values to NA
    work_time_weekly = ifelse(work_time_weekly < 0, NA_real_, work_time_weekly)
  ) %>%
  ungroup() %>%
  mutate(
    # Remove remaining NA values in ALL rows
    work_time_weekly = ifelse(is.na(work_time_weekly), NA_real_, work_time_weekly),
    imputed_flag_work_time = ifelse(is.na(pgtatzeit) | pgtatzeit %in% c(-5, -8), 1, 0)
  ) %>%
  filter(!is.na(work_time_weekly))  # Remove rows with NA work_time_weekly

# Summary Statistics for Net Income
# All rows
net_income_all <- main %>%
  summarise(
    Mean = mean(net_income, na.rm = TRUE),
    Median = median(net_income, na.rm = TRUE),
    Min = min(net_income, na.rm = TRUE),
    Max = max(net_income, na.rm = TRUE),
    Count = sum(!is.na(net_income))
  )

# Imputed rows
net_income_imputed <- main %>%
  filter(imputed_flag_net_income == 1) %>%
  summarise(
    Mean = mean(net_income, na.rm = TRUE),
    Median = median(net_income, na.rm = TRUE),
    Min = min(net_income, na.rm = TRUE),
    Max = max(net_income, na.rm = TRUE),
    Count = sum(!is.na(net_income))
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(net_income, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-imputed rows
net_income_non_imputed <- main %>%
  filter(imputed_flag_net_income == 0) %>%
  summarise(
    Mean = mean(net_income, na.rm = TRUE),
    Median = median(net_income, na.rm = TRUE),
    Min = min(net_income, na.rm = TRUE),
    Max = max(net_income, na.rm = TRUE),
    Count = sum(!is.na(net_income))
  )

# Summary Statistics for Work Time Weekly
# All rows
work_time_all <- main %>%
  summarise(
    Mean = mean(work_time_weekly, na.rm = TRUE),
    Median = median(work_time_weekly, na.rm = TRUE),
    Min = min(work_time_weekly, na.rm = TRUE),
    Max = max(work_time_weekly, na.rm = TRUE),
    Count = sum(!is.na(work_time_weekly))
  )

# Imputed rows
work_time_imputed <- main %>%
  filter(imputed_flag_work_time == 1) %>%
  summarise(
    Mean = mean(work_time_weekly, na.rm = TRUE),
    Median = median(work_time_weekly, na.rm = TRUE),
    Min = min(work_time_weekly, na.rm = TRUE),
    Max = max(work_time_weekly, na.rm = TRUE),
    Count = sum(!is.na(work_time_weekly))
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(work_time_weekly, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-imputed rows
work_time_non_imputed <- main %>%
  filter(imputed_flag_work_time == 0) %>%
  summarise(
    Mean = mean(work_time_weekly, na.rm = TRUE),
    Median = median(work_time_weekly, na.rm = TRUE),
    Min = min(work_time_weekly, na.rm = TRUE),
    Max = max(work_time_weekly, na.rm = TRUE),
    Count = sum(!is.na(work_time_weekly))
  )

# Display Results
print("Net Income Summary Statistics - All Rows:")
## [1] "Net Income Summary Statistics - All Rows:"
print(net_income_all)
## # A tibble: 1 × 5
##    Mean Median   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <int>
## 1 1654.   1430     0 24000  4757
print("Net Income Summary Statistics - Imputed Rows:")
## [1] "Net Income Summary Statistics - Imputed Rows:"
print(net_income_imputed)
## # A tibble: 1 × 5
##    Mean Median   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <int>
## 1   NaN     NA   Inf  -Inf     0
print("Net Income Summary Statistics - Non-Imputed Rows:")
## [1] "Net Income Summary Statistics - Non-Imputed Rows:"
print(net_income_non_imputed)
## # A tibble: 1 × 5
##    Mean Median   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <int>
## 1 1654.   1430     0 24000  4757
print("\nWork Time Weekly Summary Statistics - All Rows:")
## [1] "\nWork Time Weekly Summary Statistics - All Rows:"
print(work_time_all)
## # A tibble: 1 × 5
##    Mean Median   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <int>
## 1  36.6     40     1    80  4757
print("Work Time Weekly Summary Statistics - Imputed Rows:")
## [1] "Work Time Weekly Summary Statistics - Imputed Rows:"
print(work_time_imputed)
## # A tibble: 1 × 5
##    Mean Median   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <int>
## 1   NaN     NA   Inf  -Inf     0
print("Work Time Weekly Summary Statistics - Non-Imputed Rows:")
## [1] "Work Time Weekly Summary Statistics - Non-Imputed Rows:"
print(work_time_non_imputed)
## # A tibble: 1 × 5
##    Mean Median   Min   Max Count
##   <dbl>  <dbl> <dbl> <dbl> <int>
## 1  36.6     40     1    80  4757
# Plot for Net Income - All Rows
ggplot(main, aes(x = net_income)) +
  geom_histogram(binwidth = 500, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(
    title = "Net Income Distribution - All Rows",
    x = "Net Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Plot for Net Income - Imputed Rows
ggplot(main %>% filter(imputed_flag_net_income == 1), aes(x = net_income)) +
  geom_histogram(binwidth = 500, fill = "orange", color = "black", alpha = 0.7) +
  labs(
    title = "Net Income Distribution - Imputed Rows",
    x = "Net Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Plot for Net Income - Non-Imputed Rows
ggplot(main %>% filter(imputed_flag_net_income == 0), aes(x = net_income)) +
  geom_histogram(binwidth = 500, fill = "green", color = "black", alpha = 0.7) +
  labs(
    title = "Net Income Distribution - Non-Imputed Rows",
    x = "Net Income",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Plot for Work Time Weekly - All Rows
ggplot(main, aes(x = work_time_weekly)) +
  geom_histogram(binwidth = 5, fill = "lightcoral", color = "black", alpha = 0.7) +
  labs(
    title = "Work Time Weekly Distribution - All Rows",
    x = "Work Time (Hours/Week)",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Plot for Work Time Weekly - Imputed Rows
ggplot(main %>% filter(imputed_flag_work_time == 1), aes(x = work_time_weekly)) +
  geom_histogram(binwidth = 5, fill = "gold", color = "black", alpha = 0.7) +
  labs(
    title = "Work Time Weekly Distribution - Imputed Rows",
    x = "Work Time (Hours/Week)",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Plot for Work Time Weekly - Non-Imputed Rows
ggplot(main %>% filter(imputed_flag_work_time == 0), aes(x = work_time_weekly)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Work Time Weekly Distribution - Non-Imputed Rows",
    x = "Work Time (Hours/Week)",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

4.4 Unemployment

# Transform total years unemployed (pgexpue)
# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
  summarise(
    NA_count = sum(is.na(pgexpue)),
    not_asked = sum(pgexpue %in% c(-8, -5), na.rm = TRUE),
    other_neg = sum(pgexpue < 0 & !(pgexpue %in% c(-8, -5)), na.rm = TRUE)
  )

print("Breakdown of missing values before imputation:")
## [1] "Breakdown of missing values before imputation:"
print(missing_breakdown_before)
## # A tibble: 1 × 3
##   NA_count not_asked other_neg
##      <int>     <int>     <int>
## 1        0         0        46
# Step 2: Impute Total Years Unemployment with Year Difference Logic and Max 2-Year Lag/Lead
main <- main %>%
  arrange(pid, syear) %>%
  group_by(pid) %>%
  mutate(
    total_years_unemployment = ifelse(pgexpue %in% c(-5, -8), NA_real_, pgexpue),
    total_years_unemployment = {
      temp <- total_years_unemployment
      years <- syear
      for (i in seq_len(n())) {
        if (is.na(temp[i])) {
          lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
          lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
          
          if (length(lag_idx) > 0 && length(lead_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            lead_diff <- years[lead_idx[1]] - years[i]
            nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
            temp[i] <- temp[nearest] - abs(years[i] - years[nearest])  # Adjust for year difference
          } else if (length(lag_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            temp[i] <- temp[lag_idx[length(lag_idx)]] - lag_diff
          } else if (length(lead_idx) > 0) {
            lead_diff <- years[lead_idx[1]] - years[i]
            temp[i] <- temp[lead_idx[1]] - lead_diff
          }
        }
      }
      temp
    },
    total_years_unemployment = ifelse(is.na(total_years_unemployment) & pgexpue == -2, 0, total_years_unemployment)
  ) %>%
  ungroup() %>%
  mutate(
    total_years_unemployment = ifelse(total_years_unemployment < 0, NA_real_, total_years_unemployment),  # Remove negatives
    imputed_flag = if_else(is.na(pgexpue) | pgexpue %in% c(-8, -5), 1, 0)  # Track imputed rows
  )

# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
  summarise(
    NA_count_after = sum(is.na(total_years_unemployment)),
    imputed_count = sum(imputed_flag)
  )

print("Breakdown of missing values after imputation:")
## [1] "Breakdown of missing values after imputation:"
print(missing_breakdown_after)
## # A tibble: 1 × 2
##   NA_count_after imputed_count
##            <int>         <dbl>
## 1             46             0
# Step 4: Descriptive Statistics for Imputed and Non-Imputed Rows
# Imputed Rows
imputed_stats <- main %>%
  filter(imputed_flag == 1) %>%
  summarise(
    Mean_Years_Unemployed = mean(total_years_unemployment, na.rm = TRUE),
    Median_Years_Unemployed = median(total_years_unemployment, na.rm = TRUE),
    Q1 = quantile(total_years_unemployment, 0.25, na.rm = TRUE),
    Q3 = quantile(total_years_unemployment, 0.75, na.rm = TRUE),
    Min = min(total_years_unemployment, na.rm = TRUE),
    Max = max(total_years_unemployment, na.rm = TRUE),
    Count = n()
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(total_years_unemployment, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-Imputed Rows
non_imputed_stats <- main %>%
  filter(imputed_flag == 0) %>%
  summarise(
    Mean_Years_Unemployed = mean(total_years_unemployment, na.rm = TRUE),
    Median_Years_Unemployed = median(total_years_unemployment, na.rm = TRUE),
    Q1 = quantile(total_years_unemployment, 0.25, na.rm = TRUE),
    Q3 = quantile(total_years_unemployment, 0.75, na.rm = TRUE),
    Min = min(total_years_unemployment, na.rm = TRUE),
    Max = max(total_years_unemployment, na.rm = TRUE),
    Count = n()
  )

print("Descriptive Statistics for Imputed Rows:")
## [1] "Descriptive Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 7
##   Mean_Years_Unemployed Median_Years_Unemployed    Q1    Q3   Min   Max Count
##                   <dbl>                   <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1                   NaN                      NA    NA    NA   Inf  -Inf     0
print("Descriptive Statistics for Non-Imputed Rows:")
## [1] "Descriptive Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 7
##   Mean_Years_Unemployed Median_Years_Unemployed    Q1    Q3   Min   Max Count
##                   <dbl>                   <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1                 0.707                       0     0   0.5     0  27.2  4757
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = total_years_unemployment)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Total Years Unemployed - All Rows",
    x = "Total Years Unemployed",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = total_years_unemployment)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(
    title = "Distribution of Total Years Unemployed - Imputed Rows",
    x = "Total Years Unemployed",
    y = "Count"
  ) +
  theme_minimal()

# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = total_years_unemployment)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(
    title = "Distribution of Total Years Unemployed - Non-Imputed Rows",
    x = "Total Years Unemployed",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_bin()`).

4.5 Occupational Status

# Transform occupational status (pgstib)
main <- main %>%
  mutate(
    occup_status = case_when(
      between(pgstib, 210, 250) ~ 1,  # Blue collars
      between(pgstib, 510, 550) ~ 2,  # White collars
      between(pgstib, 610, 640) | pgstib == 15 ~ 3,  # Public servant
      between(pgstib, 410, 440) ~ 4,  # Self-employed
      pgstib == 12 ~ 5,  # Unemployed
      pgstib == 10 ~ 6,  # Non-working
      between(pgstib, 110, 150) | pgstib %in% c(11, 13) ~ 7,  # Apprentice, retired
      TRUE ~ 8  # Assign a default category for unmatched values
    )
  ) %>%
  # Drop the original `pgstib` column
  select(-pgstib)

# Add labels to `occup_status`, including the new "other" category
occup_status_labels <- c(
  "1" = "blue collar",
  "2" = "white collar",
  "3" = "public servant",
  "4" = "self-employed",
  "5" = "unemployed",
  "6" = "non-working",
  "7" = "apprentice, retired",
  "8" = "other"
)

main$occup_status <- factor(main$occup_status, levels = 1:8, labels = occup_status_labels)
#Visualisation: Bar Plot
ggplot(main, aes(x = occup_status)) +
  geom_bar(fill = "darkred", color = "black") +
  labs(
    title = "Distribution of Occupational Status",
    x = "Occupational Status",
    y = "Count"
  ) +
  theme_minimal()

4.6 Gender

# Transform sex into male dummy
main <- main %>%
  mutate(
    male = case_when(
      sex == 1 ~ 1,
      sex == 2 ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  group_by(pid) %>%
  mutate(male = if_else(is.na(male), first(na.omit(male)), male)) %>%
  ungroup() %>%
  drop_na(male) %>%
  select(-sex)
#Visualisation: Pie Chart
main %>%
  group_by(male) %>%
  summarize(count = n()) %>%
  mutate(
    proportion = count / sum(count),
    label = paste0(round(proportion * 100, 1), "%")
  ) %>%
  ggplot(aes(x = "", y = proportion, fill = factor(male, labels = c("Female", "Male")))) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(
    title = "Proportion of Male vs. Female",
    fill = "Gender"
  ) +
  theme_minimal() +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5))

4.7 Migration Background

# Transform migration background and create dummy
main <- main %>%
  mutate(
    migback = case_when(
      migback == -2 ~ 1,
      migback < 0 ~ NA_real_,
      TRUE ~ migback
    )
  ) %>%
  group_by(pid) %>%
  mutate(migback = if_else(is.na(migback), first(na.omit(migback)), migback)) %>%
  ungroup() %>%
  drop_na(migback) %>%
  mutate(
    migback = factor(
      migback,
      levels = c(1, 2, 3),
      labels = c("no migration", "direct migration", "indirect migration")
    ),
    migback_dummy = ifelse(migback %in% c("direct migration", "indirect migration"), 1, 0)
  )
#Visualisation: Bar plot
ggplot(main, aes(x = migback)) +
  geom_bar(fill = "cyan", color = "black") +
  labs(
    title = "Distribution of Migration Background",
    x = "Migration Background",
    y = "Count"
  ) +
  theme_minimal()

4.8 Marital Status

# Count the number of NA rows before imputation
na_count_before <- sum(is.na(main$pgfamstd_dummy))
## Warning: Unknown or uninitialised column: `pgfamstd_dummy`.
print(paste("Number of NA rows in pgfamstd_dummy before imputation:", na_count_before))
## [1] "Number of NA rows in pgfamstd_dummy before imputation: 0"
# Transform marital status and handle NA with lag/lead logic
main <- main %>%
  arrange(pid, syear) %>%  # Ensure data is ordered by pid and syear
  group_by(pid) %>%
  mutate(
    pgfamstd = case_when(
      pgfamstd < 0 ~ NA_real_,       # Replace negative values with NA
      TRUE ~ pgfamstd                # Keep the original value otherwise
    ),
    pgfamstd_dummy = case_when(
      pgfamstd %in% c(1, 6, 7, 8) ~ "married",
      pgfamstd %in% c(2, 3, 4, 5) ~ "single",
      TRUE ~ NA_character_          # Handle unexpected missings
    ),
    # Impute NA values using lag and lead logic
    pgfamstd_dummy = {
      temp <- pgfamstd_dummy
      years <- syear
      for (i in seq_len(n())) {
        if (is.na(temp[i])) {
          lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
          lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
          
          if (length(lag_idx) > 0 && length(lead_idx) > 0) {
            lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
            lead_diff <- years[lead_idx[1]] - years[i]
            nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
            temp[i] <- temp[nearest]
          } else if (length(lag_idx) > 0) {
            temp[i] <- temp[lag_idx[length(lag_idx)]]
          } else if (length(lead_idx) > 0) {
            temp[i] <- temp[lead_idx[1]]
          }
        }
      }
      temp
    }
  ) %>%
  ungroup() %>%
  mutate(
    imputed_flag = if_else(is.na(pgfamstd) | pgfamstd < 0, 1, 0)  # Track imputed rows
  )

# Drop all rows with NA in `pgfamstd_dummy`
main <- main %>%
  filter(!is.na(pgfamstd_dummy))

# Summary Statistics
# All Rows
all_stats <- main %>%
  summarise(
    Married = sum(pgfamstd_dummy == "married", na.rm = TRUE),
    Single = sum(pgfamstd_dummy == "single", na.rm = TRUE),
    Total = n()
  )

# Imputed Rows
imputed_stats <- main %>%
  filter(imputed_flag == 1) %>%
  summarise(
    Married = sum(pgfamstd_dummy == "married", na.rm = TRUE),
    Single = sum(pgfamstd_dummy == "single", na.rm = TRUE),
    Total = n()
  )

# Non-Imputed Rows
non_imputed_stats <- main %>%
  filter(imputed_flag == 0) %>%
  summarise(
    Married = sum(pgfamstd_dummy == "married", na.rm = TRUE),
    Single = sum(pgfamstd_dummy == "single", na.rm = TRUE),
    Total = n()
  )

print("Summary Statistics for All Rows:")
## [1] "Summary Statistics for All Rows:"
print(all_stats)
## # A tibble: 1 × 3
##   Married Single Total
##     <int>  <int> <int>
## 1    2903   1829  4732
print("Summary Statistics for Imputed Rows:")
## [1] "Summary Statistics for Imputed Rows:"
print(imputed_stats)
## # A tibble: 1 × 3
##   Married Single Total
##     <int>  <int> <int>
## 1       1      1     2
print("Summary Statistics for Non-Imputed Rows:")
## [1] "Summary Statistics for Non-Imputed Rows:"
print(non_imputed_stats)
## # A tibble: 1 × 3
##   Married Single Total
##     <int>  <int> <int>
## 1    2902   1828  4730
# Visualizations
# All Rows
ggplot(main, aes(x = factor(pgfamstd_dummy))) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Family Status - All Rows",
    x = "Family Status",
    y = "Count"
  ) +
  theme_minimal()

# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = factor(pgfamstd_dummy))) +
  geom_bar(fill = "orange", color = "black") +
  labs(
    title = "Distribution of Family Status - Imputed Rows",
    x = "Family Status",
    y = "Count"
  ) +
  theme_minimal()

# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = factor(pgfamstd_dummy))) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Distribution of Family Status - Non-Imputed Rows",
    x = "Family Status",
    y = "Count"
  ) +
  theme_minimal()

4.8 Born In Germany

main <- main %>%
  mutate(
    germborn = case_when(
      germborn == 2 ~ 0,
      germborn < 0 ~ NA_real_,
      TRUE ~ germborn
    )
  ) %>%
  group_by(pid) %>%
  mutate(germborn = if_else(is.na(germborn), first(na.omit(germborn)), germborn)) %>%
  ungroup() %>%
  drop_na(germborn)
#Visualisation: Bar Plot
ggplot(main, aes(x = factor(germborn, labels = c("Not Born in Germany", "Born in Germany")))) +
  geom_bar(fill = "pink", color = "black") +
  labs(
    title = "Proportion of Individuals Born in Germany",
    x = "Born in Germany",
    y = "Count"
  ) +
  theme_minimal()

4.9 Age: Filtering for age between 25 and 55

# Create age and filter age range
main <- main %>%
  arrange(pid, syear) %>%  # Ensure data is sorted by pid and syear
  group_by(pid) %>%
  mutate(
    # Impute `ple0010_h` using the closest valid value in the same group
    ple0010_h = if_else(
      is.na(ple0010_h),
      ple0010_h[!is.na(ple0010_h)] + (syear - syear[!is.na(ple0010_h)]),  # Calculate from closest year
      ple0010_h
    )
  ) %>%
  ungroup() %>%
  mutate(
    age = syear - ple0010_h  # Calculate age based on imputed or existing `ple0010_h`
  ) %>%
  filter(!is.na(ple0010_h) & ple0010_h > 0) %>%  # Filter out invalid entries
  filter(age >= 25 & age <= 55) %>%  # Keep only rows within the age range
  select(-ple0010_h)  # Drop `ple0010_h` after processing

# Visualisation: Histogram
ggplot(main, aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "gold", color = "black") +
  labs(
    title = "Distribution of Age (25-55 Years)",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal()

Final Cleanup: Keeping relevant variables

print(colnames(main))
##   [1] "pid"                        "syear"                     
##   [3] "health"                     "worr_health_categorical"   
##   [5] "ple0055"                    "ple0056"                   
##   [7] "ple0072"                    "ple0081_h"                 
##   [9] "ple0086_v4"                 "ple0080_v2"                
##  [11] "ple0086_v1"                 "ple0011"                   
##  [13] "ple0012"                    "ple0013"                   
##  [15] "ple0014"                    "ple0015"                   
##  [17] "ple0016"                    "ple0017"                   
##  [19] "ple0018"                    "ple0019"                   
##  [21] "ple0020"                    "ple0021"                   
##  [23] "ple0022"                    "ple0023"                   
##  [25] "ple0004"                    "ple0005"                   
##  [27] "plh0182"                    "plh0171"                   
##  [29] "plh0042"                    "plh0033"                   
##  [31] "plh0032"                    "ple0006"                   
##  [33] "ple0007"                    "pgerwzeit"                 
##  [35] "pgemplst"                   "pglabnet"                  
##  [37] "pgtatzeit"                  "pgexpue"                   
##  [39] "pgfamstd"                   "hid"                       
##  [41] "psample"                    "gebjahr"                   
##  [43] "migback"                    "germborn"                  
##  [45] "worr_health_dummy"          "past_health"               
##  [47] "health_diff"                "health_in_2yrs"            
##  [49] "health_decline_2yrs"        "gs01"                      
##  [51] "gripr1"                     "gripr2"                    
##  [53] "gripl1"                     "gripl2"                    
##  [55] "avg_gripr"                  "avg_gripl"                 
##  [57] "avg_grip"                   "ple0055_flag"              
##  [59] "ple0056_flag"               "imputed_flag"              
##  [61] "hospital_stay"              "ple0004_imputed"           
##  [63] "ple0005_imputed"            "impaired_stairs"           
##  [65] "impaired_tiring_tasks"      "ple0011_imputed"           
##  [67] "ple0012_imputed"            "ple0013_imputed"           
##  [69] "ple0014_imputed"            "ple0015_imputed"           
##  [71] "ple0016_imputed"            "ple0017_imputed"           
##  [73] "ple0018_imputed"            "ple0019_imputed"           
##  [75] "ple0020_imputed"            "ple0021_imputed"           
##  [77] "ple0022_imputed"            "ple0023_imputed"           
##  [79] "illnesses"                  "imputed_flag_life"         
##  [81] "imputed_flag_health"        "life_satisfaction"         
##  [83] "health_satisfaction"        "height_imputed"            
##  [85] "height"                     "weight_imputed"            
##  [87] "weight"                     "BMI"                       
##  [89] "bmi_categorical"            "smoking_imputed"           
##  [91] "smoking_dummy"              "cigarettes_imputed"        
##  [93] "nb_of_cigarettes"           "diet_categorical"          
##  [95] "diet_dummy"                 "worries_job_imputed"       
##  [97] "worries_financial_imputed"  "worries_economic_imputed"  
##  [99] "worr_job_categorical"       "worr_financial_categorical"
## [101] "worr_economic_categorical"  "worr_job_dummy"            
## [103] "worr_financial_dummy"       "worr_economic_dummy"       
## [105] "tenure"                     "net_income"                
## [107] "imputed_flag_net_income"    "work_time_weekly"          
## [109] "imputed_flag_work_time"     "total_years_unemployment"  
## [111] "occup_status"               "male"                      
## [113] "migback_dummy"              "pgfamstd_dummy"            
## [115] "age"
# Select only the relevant variables
main <- main %>%
  select(
    pid, syear,  # Unique identifiers
    health, worr_health_categorical, worr_health_dummy, 
    migback, germborn, hospital_stay, smoking_dummy, nb_of_cigarettes, illnesses, 
    impaired_stairs, impaired_tiring_tasks, diet_categorical, diet_dummy, avg_grip, 
    life_satisfaction, health_satisfaction,  
    height, weight, BMI, bmi_categorical, worr_job_categorical, worr_financial_categorical,
    worr_economic_categorical, worr_job_dummy, worr_financial_dummy, worr_economic_dummy, 
    tenure, net_income, work_time_weekly, total_years_unemployment, 
    occup_status, male, migback_dummy, age
  )


#Display the summary of the dataset
skim(main)
Data summary
Name main
Number of rows 3336
Number of columns 36
_______________________
Column type frequency:
factor 8
numeric 28
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
worr_health_categorical 0 1.00 FALSE 3 wor: 1815, wor: 833, no : 688
migback 0 1.00 FALSE 3 no : 2673, dir: 498, ind: 165
diet_categorical 1437 0.57 FALSE 4 som: 875, som: 759, ver: 160, ver: 105
bmi_categorical 32 0.99 FALSE 4 Nor: 1414, Ove: 1151, Obe: 684, Und: 55
worr_job_categorical 0 1.00 FALSE 3 no : 1684, wor: 1181, wor: 471
worr_financial_categorical 0 1.00 FALSE 3 wor: 1784, no : 838, wor: 714
worr_economic_categorical 0 1.00 FALSE 3 wor: 1999, wor: 843, no : 494
occup_status 0 1.00 FALSE 6 whi: 1974, blu: 833, sel: 245, pub: 242

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
pid 0 1.00 10398312.66 11303900.38 5203.00 2303352.00 5028151.00 20377276.00 35031602.00 ▇▁▁▁▂
syear 0 1.00 2010.65 5.48 2000.00 2006.00 2012.00 2016.00 2018.00 ▃▆▅▆▇
health 0 1.00 3.17 0.96 1.00 3.00 3.00 4.00 5.00 ▁▅▇▇▁
worr_health_dummy 0 1.00 0.79 0.40 0.00 1.00 1.00 1.00 1.00 ▂▁▁▁▇
germborn 0 1.00 0.85 0.36 0.00 1.00 1.00 1.00 1.00 ▂▁▁▁▇
hospital_stay 0 1.00 9.50 12.80 0.00 4.00 6.00 11.00 210.00 ▇▁▁▁▁
smoking_dummy 1 1.00 0.35 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
nb_of_cigarettes 0 1.00 0.12 1.31 0.00 0.00 0.00 0.00 25.00 ▇▁▁▁▁
illnesses 0 1.00 0.00 0.09 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
impaired_stairs 0 1.00 0.40 0.49 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
impaired_tiring_tasks 0 1.00 0.52 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
diet_dummy 1437 0.57 0.48 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
avg_grip 2719 0.18 389.81 117.90 45.00 301.00 365.00 475.00 775.00 ▁▇▇▅▁
life_satisfaction 12 1.00 6.91 1.83 -1.00 6.00 7.00 8.00 10.00 ▁▁▂▆▇
health_satisfaction 12 1.00 6.04 2.29 -1.00 5.00 6.00 8.00 10.00 ▁▃▆▇▇
height 6 1.00 172.58 9.52 105.00 165.00 172.00 180.00 208.00 ▁▁▃▇▁
weight 32 0.99 79.34 18.18 35.00 65.00 78.00 90.00 215.00 ▅▇▁▁▁
BMI 32 0.99 26.53 5.35 11.05 22.99 25.66 29.24 95.24 ▇▃▁▁▁
worr_job_dummy 0 1.00 0.50 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
worr_financial_dummy 0 1.00 0.75 0.43 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
worr_economic_dummy 0 1.00 0.85 0.36 0.00 1.00 1.00 1.00 1.00 ▂▁▁▁▇
tenure 1 1.00 10.09 8.98 0.00 2.50 7.33 15.71 40.17 ▇▃▂▁▁
net_income 0 1.00 1672.95 1282.51 0.00 892.75 1500.00 2100.00 24000.00 ▇▁▁▁▁
work_time_weekly 0 1.00 36.73 13.31 1.00 30.00 40.00 45.00 80.00 ▂▃▇▂▁
total_years_unemployment 36 0.99 0.75 2.01 0.00 0.00 0.00 0.50 23.08 ▇▁▁▁▁
male 0 1.00 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
migback_dummy 0 1.00 0.20 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
age 0 1.00 42.46 8.40 25.00 36.00 44.00 50.00 55.00 ▅▅▆▇▇

Saving the cleaned dataset

# Save the final main dataset as a CSV file
write.csv(main, "SOEP_main_final.csv", row.names = FALSE)